"""实验一：中文分词 统计三国演义出场人物次数前5名"""

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Input novel (UTF-8 text) and a font able to render Chinese glyphs.
TEXT_PATH = "三国演义.txt"
FONT_PATH = r"C:\Windows\Fonts\simsun.ttc"

# Frequent multi-character tokens that should not appear in the cloud.
EXCLUDE = frozenset({"将军", "二人", "荆州", "不可", "却说", "不能", "如此"})

# Alternative names / titles mapped to one canonical character name.
ALIASES = {
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
    "云长": "关羽",
    "关公": "关羽",
    "孔明": "诸葛亮",
    "孔明曰": "诸葛亮",
}


def normalize_words(tokens):
    """Return the tokens worth plotting, with character aliases merged.

    Single-character tokens and tokens listed in ``EXCLUDE`` are dropped.
    Any remaining token found in ``ALIASES`` is replaced by its canonical
    name.  The relative order of the surviving tokens is preserved.

    Args:
        tokens: Iterable of word strings (e.g. the output of ``jieba.lcut``).

    Returns:
        A new list of filtered, canonicalised words.
    """
    return [
        ALIASES.get(token, token)
        for token in tokens
        # Exclusion is checked on the raw token, before alias merging.
        if len(token) != 1 and token not in EXCLUDE
    ]


def main():
    """Segment the novel, merge character aliases and show a word cloud."""
    # 1. Read the whole novel into one string.
    with open(TEXT_PATH, encoding="utf-8") as f:
        text = f.read()

    # 2. Segment with jieba, then filter noise and merge aliases.
    words = normalize_words(jieba.lcut(text))

    # 3. Render the word cloud from the space-joined words.
    cloud = WordCloud(
        font_path=FONT_PATH,
        width=1000,
        height=800,
        background_color="white",
        # The words were already segmented and filtered, so neighbours in
        # the joined string were not necessarily adjacent in the novel;
        # bigram detection would only add spurious two-word phrases and
        # distort the per-word frequencies.
        collocations=False,
    )
    plt.imshow(cloud.generate(" ".join(words)))
    plt.show()


if __name__ == "__main__":
    main()